Higgs dataset experiment

Researchers: Kirell Benzi, Benjamin Ricaud, Pierre Vandergheynst

From the paper: Principal Patterns on Graphs

The dataset is available from SNAP (the Stanford Network Analysis Project).
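
The raw SNAP archive uses different file names from the HiggsDiscovery_* files loaded below. A minimal preparation sketch, assuming the standard SNAP higgs-twitter file names (the mapping and the helper are our convention, not part of the original pipeline):


In [ ]:
# Hypothetical helper: copy the SNAP higgs-twitter files to the names
# this notebook expects. The SNAP names are the published archive names;
# the HiggsDiscovery_* names are the ones used in the cells below.
import os
import shutil

SNAP_TO_LOCAL = {
    'higgs-retweet_network.edgelist.gz':  'HiggsDiscovery_RT.edges.gz',
    'higgs-mention_network.edgelist.gz':  'HiggsDiscovery_MT.edges.gz',
    'higgs-reply_network.edgelist.gz':    'HiggsDiscovery_RE.edges.gz',
    'higgs-social_network.edgelist.gz':   'HiggsDiscovery_social.edges.gz',
    'higgs-activity_time.txt':            'HiggsDiscovery_multiplex_time.txt',
}

def prepare_data_dir(data_dir):
    for snap_name, local_name in SNAP_TO_LOCAL.iteritems():
        src = os.path.join(data_dir, snap_name)
        if os.path.exists(src):
            shutil.copy(src, os.path.join(data_dir, local_name))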


In [15]:
import os
import time
import numpy as np
import scipy as sp
import pandas as pd
import networkx as nx
import datetime as dt
import itertools
import operator
import community
import matplotlib as mpl
import matplotlib.pyplot as plt
from collections import Counter

import IPython.utils.path
DATA_DIR = os.path.join(IPython.utils.path.get_home_dir(), 'data/higgs/')
print 'Data directory:', DATA_DIR
dataset_name = 'higgs'


# Install this from https://github.com/kikohs/sptgraph
import sptgraph
%load_ext autoreload
%autoreload 2


# Customize plot colors for dark backgrounds
%matplotlib inline
mpl.rcParams['axes.edgecolor'] = 'grey'
mpl.rcParams['grid.color'] = '#66CCCC'
mpl.rcParams['text.color'] = '#0EBFE9'
mpl.rcParams['xtick.color'] = '#66CCCC'
mpl.rcParams['ytick.color'] = '#66CCCC'
mpl.rcParams['axes.labelcolor'] = '#0EBFE9'


Data directory: /home/ubuntu/data/higgs/
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Optional: use bokeh to plot interactive components


In [ ]:
from bokeh.charts import Bar, show
from bokeh.plotting import output_notebook
output_notebook()

In [ ]:
def plot_spatial_spread_graph(comp):
    nodes = sorted(comp.nodes(data=True), key=lambda x: x[1]['layer'])
    group_hist = []
    layers = []
    for layer, n in itertools.groupby(nodes, key=lambda x: x[1]['layer']):
        elems = list(n)
        # Use the timestamp instead of the layer index as the label.
        # Workaround for a BokehJS bug: labels cannot contain colons ...
        t = elems[0][1]['timestamp'].strftime('%d-%m-%Y %H\'%M\'%S')
        layers.append(t)
        group_hist.append(len(elems))
    
    bar = Bar(np.array(group_hist), layers, title="Spatial spread of component " + str(comp.name), stacked=False)
    return bar

Helper functions


In [3]:
def round_df(df, freq, unit):
    """Floor the DatetimeIndex of df to freq * unit bins.
    Returns the modified df and the matching pandas offset alias."""
    mul = 1
    if unit == 'min':
        mul = 60
        unit = 'Min'

    elif unit == 'hour':
        mul = 60 * 60
        unit = 'H'

    elif unit == 'day':
        mul = 60 * 60 * 24
        unit = 'D'
    else:
        mul = 1
        unit = 'S'

    ns_min = freq * mul * 1000000000
    idx = pd.DatetimeIndex(((df.index.astype(np.int64) // ns_min) * ns_min))
    idx.name = df.index.name
    df.index = idx
    return df, unit


def create_layers(df, freq, unit):
    # Trick: use pandas resample to generate continuous indexes
    dfR = df.resample(str(freq) + unit) 
    layer_map = dict(itertools.izip(dfR.index.astype(np.int64), itertools.count()))
    df['layer'] = np.vectorize(lambda x: layer_map[x])(df.index.astype(np.int64))
    # delta between layers (timedelta)
    delta = dfR.index[1] - dfR.index[0]
    return df, delta
    
    
def create_layered_graph(input_df, freq=1, unit='min', action='RT', social_graph=None):
        
    h = nx.DiGraph()
    h.name = 'Higgs layered ' + action + ' - ' + str(freq) + ' ' + unit
    
    df = input_df.copy()
    if action != '' and action is not None:
        df = df[df['action'] == action]
    
    df, unit = round_df(df, freq, unit)
    df, delta_ts = create_layers(df, freq, unit)
    # Maximum id in the whole dataframe; base ids must start at 1 (not 0)
    # so that the per-layer offset below cannot collide across layers
    max_id = np.max(np.max(df[['src_id', 'tgt_id']]).values)
    
    for idx, row in df.iterrows():
        base_src = row['src_id']
        base_tgt = row['tgt_id']
        layer = row['layer']
        act = row['action']
              
        src = base_src + (layer * max_id)
        tgt = base_tgt + ((layer + 1) * max_id)
        
        # Add nodes
        h.add_node(src, {'base_id': base_src, 'layer': layer, 'timestamp': idx})
        h.add_node(tgt, {'base_id': base_tgt, 'layer': layer + 1, 'timestamp': idx + delta_ts})
        
        # Add edge
        e_d = {'action': act, 'timestamp': idx}
        if social_graph is not None:
            if social_graph.has_edge(base_src, base_tgt):
                e_d['is_social'] = True
            else:
                e_d['is_social'] = False
            
        h.add_edge(src, tgt, e_d)
        
    return h, df


def extract_components(h):
    components = filter(lambda x: x.number_of_edges() >= 1 and x.number_of_nodes() >= 2, 
                    nx.weakly_connected_component_subgraphs(h))
    res = []
    for i, comp in enumerate(components):
        comp.name = i
        res.append({'component': comp})
        
    df = pd.DataFrame(res)
    df.index.name = 'component_id'
    return df


def enrich_components(df, with_social=False):
    
    def get_period_span(c):
        ts = sorted(nx.get_node_attributes(c, 'timestamp').values())
        return (ts[0], ts[-1])
    
    def get_social_edge_ratio(c):
        social_edges = sum(nx.get_edge_attributes(c, 'is_social').values())  # True == 1, False 0
        return social_edges / float(c.number_of_edges())
        
    df['node_count'] = df['component'].apply(lambda x: x.number_of_nodes())
    df['edge_count'] = df['component'].apply(lambda x: x.number_of_edges())
    df['height'] = df['component'].apply(lambda x: len(np.unique(nx.get_node_attributes(x, 'base_id').values())))
    df['width'] = df['component'].apply(lambda x: len(np.unique(nx.get_node_attributes(x, 'layer').values())))
    period_series = df['component'].apply(get_period_span)
    df['start'] = period_series.apply(lambda x: x[0])
    df['end'] = period_series.apply(lambda x: x[1])
    if with_social:
        df['social_ratio'] = df['component'].apply(get_social_edge_ratio)
    
    return df.sort('node_count', ascending=False)


def create_activated_components(input_df, freq=1, unit='min', action='RT', social_graph=None):
    h, _ = create_layered_graph(input_df, freq, unit, action, social_graph)
    comp_df = extract_components(h)
    with_social = social_graph is not None
    return enrich_components(comp_df, with_social)


def create_graph_from_activity(activity_df, action='RT'):
    g = nx.DiGraph()
    df = activity_df[activity_df['action'] == action]
    g.name = 'Higgs ' + action
    for idx, d in df.iterrows():
        src = d['src_id']
        tgt = d['tgt_id']
        if not g.has_edge(src, tgt):
            g.add_edge(src, tgt, weight=1)
        else:
            g[src][tgt]['weight'] += 1

    return g


def overlap_graph(g1, g2):
    common_edges = 0
    for u, v in g1.edges_iter():
        if g2.has_edge(u, v):
            common_edges += 1

    res = common_edges * 100 / float(nx.number_of_edges(g1))
    print 'Percentage of overlap (', g1.name, ',', g2.name, '):', res
    print 'Number of common edges:', common_edges
    return res, common_edges


def parse_activity(path, reverse=True):
    """if reverse is True. edges to keep the causality from cause to effet"""
    names = ['src_id', 'tgt_id', 'timestamp', 'action']
    if reverse:
        names = ['tgt_id', 'src_id', 'timestamp', 'action']
        
    df = pd.read_csv(path, sep=' ', header=None, names=names,
                     dtype={'src_id': np.int64, 'tgt_id': np.int64, 'timestamp': np.int64, 'action': str},
                     index_col=2)
    
    df['action'] = df['action'].astype(str)
    df.index = df.index.astype('datetime64[s]')
    df.index.name = 'timestamp'
    return df
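
As a quick sanity check of the two time helpers above: round_df floors every timestamp to the start of its bin, and create_layers numbers the bins consecutively. A minimal sketch on synthetic data (the three timestamps are made up for illustration):


In [ ]:
# Illustration only: three fake retweet events, 10-minute bins.
toy = pd.DataFrame({'src_id': [1, 2, 3], 'tgt_id': [4, 5, 6], 'action': ['RT'] * 3},
                   index=pd.DatetimeIndex(['2012-07-04 08:03:10',
                                           '2012-07-04 08:07:55',
                                           '2012-07-04 08:12:01'], name='timestamp'))
toy, u = round_df(toy, 10, 'min')
print toy.index.tolist()     # timestamps floored to 08:00, 08:00 and 08:10
toy, delta = create_layers(toy, 10, u)
print toy['layer'].tolist()  # [0, 0, 1]
print delta                  # 0:10:00, the step between consecutive layers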

Parse data

Parse retweet graph


In [ ]:
start = time.time()
# path = os.path.join(DATA_DIR, 'HiggsDiscovery_RT.edges.gz')
# RETWEET = nx.read_edgelist(path, create_using=nx.DiGraph(),
#                            nodetype=int, data=(('weight', int),))
# RETWEET.name = 'Higgs RT'
# nx.write_gpickle(RETWEET, os.path.join(DATA_DIR, 'retweet.gpickle'))
RETWEET = nx.read_gpickle(os.path.join(DATA_DIR, 'retweet.gpickle'))
print 'Retweet graph loaded in:', time.time() - start
print nx.info(RETWEET)

Parse mention graph


In [ ]:
start = time.time()
# MENTION = nx.read_edgelist(os.path.join(DATA_DIR, 'HiggsDiscovery_MT.edges.gz'), 
#                            create_using=nx.DiGraph(), nodetype=int, data=(('weight', int),))
# MENTION.name = 'Higgs MT'
# nx.write_gpickle(MENTION, os.path.join(DATA_DIR, 'mention.gpickle'))

MENTION = nx.read_gpickle(os.path.join(DATA_DIR, 'mention.gpickle'))
print 'Mention graph loaded in:', time.time() - start
print nx.info(MENTION)

Parse reply graph


In [ ]:
start = time.time()
# REPLY = nx.read_edgelist(os.path.join(DATA_DIR, 'HiggsDiscovery_RE.edges.gz'),
#                          create_using=nx.DiGraph(), nodetype=int, data=(('weight', int),))
# REPLY.name = 'Higgs RE'
# nx.write_gpickle(REPLY, os.path.join(DATA_DIR, 'reply.gpickle'))

REPLY = nx.read_gpickle(os.path.join(DATA_DIR, 'reply.gpickle'))
print 'Reply graph loaded in:', time.time() - start
print nx.info(REPLY)

Parse social network (follower network)


In [ ]:
start = time.time()
# SOCIAL = nx.read_edgelist(os.path.join(DATA_DIR, 'HiggsDiscovery_social.edges.gz'),
#                           create_using=nx.DiGraph(), nodetype=int, )
# SOCIAL.name = 'Higgs SOCIAL'
# nx.write_gpickle(SOCIAL, os.path.join(DATA_DIR, 'social.gpickle'))

SOCIAL = nx.read_gpickle(os.path.join(DATA_DIR, 'social.gpickle'))
print 'Social graph loaded in:', time.time() - start
print nx.info(SOCIAL)

In [4]:
start = time.time()
# SOCIAL_RE = nx.reverse(SOCIAL)
# SOCIAL_RE.name = 'Higgs SOCIAL reversed'
# nx.write_gpickle(SOCIAL_RE, os.path.join(DATA_DIR, 'social_re.gpickle'))
SOCIAL_RE = nx.read_gpickle(os.path.join(DATA_DIR, 'social_re.gpickle'))
print 'Social reversed graph loaded in:', time.time() - start
print nx.info(SOCIAL_RE)


Social reversed graph loaded in: 86.8898489475
Name: Higgs SOCIAL reversed
Type: DiGraph
Number of nodes: 456626
Number of edges: 14855842
Average in degree:  32.5339
Average out degree:  32.5339

Analysis

  • Period I: Before the announcement on 2nd July, there were some rumors about the discovery of a Higgs-like boson at Tevatron;
  • Period II: On 2nd July at 1 PM GMT, scientists from the CDF and D0 experiments, based at Tevatron, presented results indicating that the Higgs particle should have a mass between 115 and 135 GeV/c² (about 123-144 times the mass of the proton) [7];
  • Period III: After 2nd July and before 4th July, there were many rumors about the Higgs boson discovery at LHC [8];
  • Period IV: The main event was the announcement on 4th July at 8 AM GMT by the scientists from the ATLAS and CMS experiments, based at CERN, presenting results indicating the existence of a new particle, compatible with the Higgs boson, with mass around 125 GeV/c² [9, 10]. After 4th July, popular media covered the event. These four time windows are encoded in the sketch below.
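
A minimal sketch encoding the four windows for slicing the activity dataframe; the announcement times are the ones quoted above (all GMT), while the outer boundaries of Periods I and IV and the midnight cut between Periods II and III are our assumptions:


In [ ]:
# Assumed period boundaries (GMT) for slicing the activity dataframe.
PERIODS = {
    'I':   (pd.Timestamp('2012-07-01 00:00:00'), pd.Timestamp('2012-07-02 13:00:00')),
    'II':  (pd.Timestamp('2012-07-02 13:00:00'), pd.Timestamp('2012-07-03 00:00:00')),
    'III': (pd.Timestamp('2012-07-03 00:00:00'), pd.Timestamp('2012-07-04 08:00:00')),
    'IV':  (pd.Timestamp('2012-07-04 08:00:00'), pd.Timestamp('2012-07-08 00:00:00')),
}

def activity_in_period(df, name):
    """Return the rows of an activity dataframe falling in the named period."""
    start, end = PERIODS[name]
    return df[(df.index >= start) & (df.index < end)]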

Extract causal multilayer graph activated components

The graph is already given by the activity, so we don't use our framework in this special case.
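
Concretely, create_layered_graph above keeps node ids unique across layers by offsetting the base id with layer * max_id. A small worked example of the encoding and its inverse (both helpers are ours, for illustration):


In [ ]:
# A node with base_id b in layer l gets id b + l * max_id, which is
# collision-free as long as base ids run from 1 to max_id (hence the
# "min should not be 0" remark in create_layered_graph).
def encode_node(base_id, layer, max_id):
    return base_id + layer * max_id

def decode_node(node_id, max_id):
    layer, base_id = divmod(node_id, max_id)
    if base_id == 0:  # the id max_id itself belongs to the previous layer
        layer, base_id = layer - 1, max_id
    return base_id, layer

nid = encode_node(42, 3, 456626)  # 456626: node count of the social graph
print nid                         # 1369920
print decode_node(nid, 456626)    # (42, 3)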


In [5]:
ACTIVITY = parse_activity(os.path.join(DATA_DIR, 'HiggsDiscovery_multiplex_time.txt'), reverse=True)
COMPS = create_activated_components(ACTIVITY, 10, 'min', 'RT', SOCIAL_RE)  # slow: builds the full layered graph

Filter components and extract global statistics


In [ ]:
FILT_COMPS = COMPS[COMPS['node_count'] > 20]  # drop components that are too small
C = FILT_COMPS.set_index('start', drop=False).sort_index()
# Resample to get statistics (averages the values)
C.resample('12H')

In [ ]:
fig = plot_spatial_spread_graph(FILT_COMPS.iloc[0]['component'])
show(fig)

Create "classic" causal multilayer with our framework


In [19]:
def create_signal(input_df, freq=1, unit='min', action='RT', subset=None):
    df = input_df.copy()
    if action != '' and action is not None:
        df = df[df['action'] == action]
        
    df, unit = round_df(df, freq, unit)
    df, delta_ts = create_layers(df, freq, unit)
    df.reset_index(drop=False, inplace=True)

    res = None
    if subset is None:
        src_df = df[['src_id', 'layer', 'timestamp']].rename(columns = {'src_id':'base_id'})
        tgt_df = df[['tgt_id', 'layer', 'timestamp']].rename(columns = {'tgt_id':'base_id'})

        tgt_df['layer'] = tgt_df['layer'].apply(lambda x: x + 1)
        tgt_df['timestamp'] = tgt_df['timestamp'].apply(lambda x: x + delta_ts)

        res = pd.concat([src_df, tgt_df], ignore_index=True) \
                    .sort('layer').reset_index(drop=True).drop_duplicates()
    
    else:
        subset_df = df[[subset, 'layer', 'timestamp']].rename(columns = {subset: 'base_id'})
        res = subset_df.sort('layer').reset_index(drop=True).drop_duplicates()
    
    return res


def get_period_span_gl(df, signal):
    
    def impl(row):
        min_layer = layer_to_ts[np.min(row)]
        max_layer = layer_to_ts[np.max(row)]
        return [min_layer, max_layer]
    
    layer_to_ts = dict(signal[['layer', 'timestamp']].values)
    
    period_series = df['layers'].apply(impl)
    df['start'] = period_series.apply(lambda x: x[0])
    df['end'] = period_series.apply(lambda x: x[1])
    return df


def extract_components_gl(signal, graph, create_self_edge=True):
    node_signal = sptgraph.utils.from_pandas(signal)
    h, max_id = sptgraph.create_spatio_temporal_graph(graph, node_signal, create_self_edge, 'base_id', 'layer')
    h = sptgraph.find_connected_components(h)
    comps = sptgraph.get_component_sframe(h, 'base_id', 'layer')
    return get_period_span_gl(comps, signal), h, max_id

In [8]:
SOCIAL_GL_RE = sptgraph.utils.networkx_to_graphlab(SOCIAL_RE)

First scale: 10-min RT

Using only tgt_id as the activation signal


In [20]:
signal_10min_rt_tgt = create_signal(ACTIVITY, 10, 'min', 'RT', subset='tgt_id')
gl_comps_10min_rt_tgt, H, max_id = extract_components_gl(signal_10min_rt_tgt, SOCIAL_GL_RE)
layer_to_ts = dict(signal_10min_rt_tgt[['layer', 'timestamp']].values)


Create spatio-temporal graph
Create node signal
Create node signal done in: 2.12166500092 seconds
Create signal graph
Create signal graph done in: 27.1102449894 seconds
Create spatio-temporal graph done in: 63.8228449821 seconds

In [21]:
gl_comps_10min_rt_tgt[['component_id', 'node_count', 'width', 'height', 'start', 'end']]


Out[21]:
+--------------+------------+-------+--------+---------------------------+---------------------------+
| component_id | node_count | width | height |           start           |            end            |
+--------------+------------+-------+--------+---------------------------+---------------------------+
|     650      |   55037    |  108  | 36800  | 2012-07-04 03:10:00+00:00 | 2012-07-04 21:00:00+00:00 |
|     483      |    357     |   15  |  324   | 2012-07-03 17:40:00+00:00 | 2012-07-03 20:00:00+00:00 |
|     9736     |    299     |   12  |  277   | 2012-07-05 11:30:00+00:00 | 2012-07-05 13:20:00+00:00 |
|     9398     |    254     |   12  |  231   | 2012-07-05 05:00:00+00:00 | 2012-07-05 06:50:00+00:00 |
|     9113     |    244     |   9   |  235   | 2012-07-05 00:00:00+00:00 | 2012-07-05 01:20:00+00:00 |
|     138      |    232     |   12  |  212   | 2012-07-02 16:20:00+00:00 | 2012-07-02 18:10:00+00:00 |
|     551      |    200     |   14  |  163   | 2012-07-03 21:00:00+00:00 | 2012-07-03 23:10:00+00:00 |
|     7705     |    169     |   5   |  107   | 2012-07-04 14:40:00+00:00 | 2012-07-04 15:20:00+00:00 |
|     9688     |    166     |   9   |  160   | 2012-07-05 10:30:00+00:00 | 2012-07-05 11:50:00+00:00 |
|     8932     |    142     |   9   |  128   | 2012-07-04 20:50:00+00:00 | 2012-07-04 22:10:00+00:00 |
|     ...      |    ...     |  ...  |  ...   |            ...            |            ...            |
+--------------+------------+-------+--------+---------------------------+---------------------------+
[14006 rows x 6 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

Plot the spatial spread of the biggest component through time


In [11]:
big_comp = gl_comps_10min_rt_tgt[0]

In [22]:
def plot_component_spatial_spread(comp, layer_to_ts, size=(10, 10), savepath=None):
    hist = Counter(sorted(comp['layers']))
    x, y = zip(*sorted(hist.items(), key=operator.itemgetter(0)))
    
    comp_time = map(lambda t: (t, layer_to_ts[t]), x) 
    # Get xticks by hour
    xticks, labels = zip(*[(k, v.strftime('%H-%M')) for (k, v) in comp_time if v.minute == 0])

    fig = plt.figure(0)
    fig.set_size_inches(size)
    ax = plt.plot(x, y)
    plt.xticks(xticks, labels, rotation=45)
    
    plt.xlabel('Time')
    plt.ylabel('Number of retweets')
    plt.title('Spatial spread of component {}'.format(comp['component_id']))
    if savepath is not None:
        fig.savefig(savepath, bbox_inches='tight')
    plt.show()

# plot_component_spatial_spread(big_comp, layer_to_ts)

Extract the biggest component subgraph for further analysis


In [92]:
g = sptgraph.component_to_networkx(big_comp, H, layer_to_ts)
print nx.info(g)


Name: Component 650
Type: DiGraph
Number of nodes: 55037
Number of edges: 106895
Average in degree:   1.9422
Average out degree:   1.9422

In [80]:
g_un = g.to_undirected()
partition = community.best_partition(g_un)

In [81]:
print 'Number of communities:', len(set(partition.values()))


Number of communities: 166

In [82]:
print 'Modularity:', community.modularity(partition, g_un)


Modularity: 0.829348352906
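
With 166 communities at modularity 0.83, it is worth checking how skewed the community sizes are. A short follow-up sketch on the partition dict (partition maps node id to community id):


In [ ]:
# Community sizes: counting the community ids gives one count per community.
comm_sizes = Counter(partition.values())
print 'Ten largest communities (community_id, size):'
print comm_sizes.most_common(10)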

Export the component for plotting using Gephi


In [93]:
path = os.path.join(DATA_DIR, g.name)
# nx.write_edgelist(g, path + '.csv', data=False)
nx.write_graphml(g, path + '.graphml')

Another scale: 1-min RT

Using only tgt_id


In [ ]:
signal_1min_rt_tgt = create_signal(ACTIVITY, 1, 'min', 'RT', subset='tgt_id')
gl_comps_1min_rt_tgt, H1, _ = extract_components_gl(signal_1min_rt_tgt, SOCIAL_GL_RE)

In [ ]:
gl_comps_1min_rt_tgt[['component_id', 'node_count', 'width', 'height', 'start', 'end']]

Overlap between retweet and social network


In [ ]:
# overlap_graph is defined with the helper functions above
# overlap_graph(REPLY, SOCIAL)
overlap_graph(RETWEET, SOCIAL)
# overlap_graph(MENTION, SOCIAL)